Thera Bank recently saw a steep decline in the number of users of its credit cards. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees. Some fees are charged to every user irrespective of usage, while others are charged only under specified circumstances.
Customers leaving the credit card service would lead to a loss for the bank, so the bank wants to analyze customer data to identify which customers are likely to leave their credit card services and why — so that the bank can improve in those areas.
As a data scientist at Thera Bank, you need to come up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards.
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
#libraries to help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier)
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import randint as sp_randint
# Load the churn dataset from the working directory.
dataCC = pd.read_csv('BankChurners.csv')
#taking a copy of the dataset in case we need original data later on
data = dataCC.copy()
# Quick sanity checks: first/last rows, dimensions, dtypes.
dataCC.head()
dataCC.tail()
dataCC.shape
dataCC.info()
#Just checking null count, even though we saw there are no nulls except for 'Unnamed: 21'
dataCC.isnull().sum().sort_values(ascending=False)
#Since 'Unnamed: 21' is an empty column, it can be dropped
dataCC.drop('Unnamed: 21',axis=1,inplace=True)
#Checking null count and shape again to confirm empty column is dropped
dataCC.isnull().sum().sort_values(ascending=False)
dataCC.shape
# Unique-value counts per column — helps spot identifier-like columns.
dataCC.nunique().sort_values(ascending=False)
#Dropping column CLIENTNUM
# CLIENTNUM is a per-customer identifier, so it carries no predictive signal.
dataCC.drop('CLIENTNUM',axis=1,inplace=True)
#Checking shape again to make sure that CLIENTNUM is dropped
dataCC.shape
#This gives the summary for numeric (int/float) columns
dataCC.describe().T
#This gives the summary for non-numeric (object) columns
dataCC.describe(include=['object']).T
# Count of fully duplicated rows (expected to be 0).
dataCC.duplicated().sum()
# Columns inspected as categorical via value_counts below.
cat_cols = ['Contacts_Count_12_mon','Months_Inactive_12_mon','Education_Level','Income_Category',
'Dependent_count','Total_Relationship_Count','Card_Category','Marital_Status','Gender','Attrition_Flag']
for col in cat_cols:
    print(dataCC[col].value_counts())
    print('-'*50)
#Converting object types to category
for col in dataCC.columns:
    if dataCC[col].dtype == 'object':
        dataCC[col] = dataCC[col].astype('category')
Note: these count-like features could take more distinct values if the dataset grows, hence they will be treated as numerical when building the models.
#checking info() again to see the data types
# (object columns should now show as 'category')
dataCC.info()
# Function to create boxplot and histogram for any input numerical variable
# This function takes the numerical column as the input and returns the boxplots
# and histograms for the variable.
def histogram_boxplot(feature, figsize=(15, 10), bins=None):
    """Boxplot (top) and histogram (bottom) of a numeric series, combined.

    feature: 1-d feature array (e.g. a pandas Series)
    figsize: size of the figure (default (15, 10))
    bins: number of histogram bins (default None / auto)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,                                   # 2 rows: boxplot over histogram
        sharex=True,                               # shared x-axis keeps the plots aligned
        gridspec_kw={"height_ratios": (.25, .75)},
        figsize=figsize,
    )
    # Boxplot; showmeans marks the column mean with a star.
    sns.boxplot(feature, ax=ax_box2, showmeans=True, color='violet')
    # Histogram.
    # BUG FIX: the original passed kde=F (undefined name -> NameError) whenever
    # `bins` was supplied, and passed `palette`, which sns.distplot does not accept.
    if bins:
        sns.distplot(feature, kde=False, ax=ax_hist2, bins=bins)
    else:
        sns.distplot(feature, kde=False, ax=ax_hist2)
    ax_hist2.axvline(np.mean(feature), color='green', linestyle='--')  # mean marker
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-')  # median marker
# Univariate analysis of the numeric columns; the filter expressions below
# list the extreme observations seen in each distribution.
#Customer_Age
histogram_boxplot(dataCC['Customer_Age'])
dataCC[dataCC['Customer_Age'] > 65 ]['Customer_Age'].sort_values(ascending=False)
#Months_on_book
histogram_boxplot(dataCC['Months_on_book'])
dataCC[dataCC['Months_on_book'] >= 50 ]['Months_on_book'].sort_values(ascending=False)
dataCC[dataCC['Months_on_book'] < 20 ]['Months_on_book'].sort_values(ascending=True)
#Credit_Limit
histogram_boxplot(dataCC['Credit_Limit'])
dataCC[dataCC['Credit_Limit'] > 25000 ]['Credit_Limit'].sort_values(ascending=False)
#Total_Revolving_Bal
histogram_boxplot(dataCC['Total_Revolving_Bal'])
#Avg_Open_To_Buy
histogram_boxplot(dataCC['Avg_Open_To_Buy'])
# NOTE(review): this filter inspects Credit_Limit inside the Avg_Open_To_Buy
# section — presumably because the two columns are strongly related; confirm.
dataCC[dataCC['Credit_Limit'] >= 30000 ]['Credit_Limit'].sort_values(ascending=False)
#Total_Amt_Chng_Q4_Q1
histogram_boxplot(dataCC['Total_Amt_Chng_Q4_Q1'])
dataCC[dataCC['Total_Amt_Chng_Q4_Q1'] > 2.0 ]['Total_Amt_Chng_Q4_Q1'].sort_values(ascending=False)
dataCC[dataCC['Total_Amt_Chng_Q4_Q1'] < 0.2 ]['Total_Amt_Chng_Q4_Q1'].sort_values(ascending=True)
#Total_Trans_Amt
histogram_boxplot(dataCC['Total_Trans_Amt'])
dataCC[(dataCC['Total_Trans_Amt'] > 7500) & (dataCC['Total_Trans_Amt'] < 10000) ]['Total_Trans_Amt'].sort_values(ascending=False)
dataCC[(dataCC['Total_Trans_Amt'] > 12500) & (dataCC['Total_Trans_Amt'] < 17000) ]['Total_Trans_Amt'].sort_values(ascending=False)
#Total_Trans_Ct
histogram_boxplot(dataCC['Total_Trans_Ct'])
dataCC[dataCC['Total_Trans_Ct'] > 135 ]['Total_Trans_Ct'].sort_values(ascending=False)
#Total_Ct_Chng_Q4_Q1
histogram_boxplot(dataCC['Total_Ct_Chng_Q4_Q1'])
dataCC[dataCC['Total_Ct_Chng_Q4_Q1'] > 2.0 ]['Total_Ct_Chng_Q4_Q1'].sort_values(ascending=False)
dataCC[dataCC['Total_Ct_Chng_Q4_Q1'] < 0.2 ]['Total_Ct_Chng_Q4_Q1'].sort_values(ascending=True)
#Avg_Utilization_Ratio
histogram_boxplot(dataCC['Avg_Utilization_Ratio'])
#Function to plot barchart with percentage for categorical variables
def perc_on_bar(feature):
    """Draw a countplot of `feature` (a column of dataCC) and annotate each
    bar with the percentage of rows that fall in that class.

    feature: categorical feature
    the function won't work if a column is passed in hue parameter
    """
    sns.set(rc={'figure.figsize':(15,7)})
    axis = sns.countplot(x=feature, data=dataCC)
    n_rows = len(feature)  # total number of observations in the column
    for bar in axis.patches:
        # Share of this class, formatted to one decimal place.
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        x_pos = bar.get_x() + bar.get_width() / 2 - 0.25  # center (roughly) over the bar
        y_pos = bar.get_y() + bar.get_height()            # just above the bar top
        axis.annotate(share, (x_pos, y_pos), size=14)
    plt.show()
# Univariate analysis of the categorical columns using perc_on_bar.
#Dependent_count
perc_on_bar(dataCC['Dependent_count'])
#Total_Relationship_Count
perc_on_bar(dataCC['Total_Relationship_Count'])
#Months_Inactive_12_mon
perc_on_bar(dataCC['Months_Inactive_12_mon'])
#Contacts_Count_12_mon
perc_on_bar(dataCC['Contacts_Count_12_mon'])
#Education_Level
perc_on_bar(dataCC['Education_Level'])
#Income_Category
perc_on_bar(dataCC['Income_Category'])
#Card_Category
perc_on_bar(dataCC['Card_Category'])
#Marital_Status
perc_on_bar(dataCC['Marital_Status'])
#Gender
perc_on_bar(dataCC['Gender'])
#Attrition_Flag
perc_on_bar(dataCC['Attrition_Flag'])
#Converting Attrition_Flag to 1 and 0 and changing to integer
# (needed so the target appears in the numeric correlation heatmap below)
attrition = {'Existing Customer':0, 'Attrited Customer':1}
dataCC['Attrition_Flag']=dataCC['Attrition_Flag'].map(attrition)
dataCC['Attrition_Flag'] = dataCC['Attrition_Flag'].astype('int64')
#Heatmap to visualize the relationship between non-categorical variables
fig, ax = plt.subplots(figsize=(10,7));
sns.heatmap(dataCC.corr(),annot=True,linewidths=0.1,cmap='YlGnBu',fmt='.2f');
#Pairplot for the dataset
sns.pairplot(data=dataCC, hue='Attrition_Flag');
#Converting Attrition_Flag back to original values and type category
attrition = {0:'Existing Customer', 1:'Attrited Customer'}
dataCC['Attrition_Flag']=dataCC['Attrition_Flag'].map(attrition)
dataCC['Attrition_Flag'] = dataCC['Attrition_Flag'].astype('category')
#Bivariate Analysis - let us look at how target variable (Attrition_Flag) is distributed amongst continuous variables
numeric_cols = ['Customer_Age','Months_on_book','Credit_Limit','Total_Revolving_Bal','Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Total_Trans_Amt','Total_Trans_Ct','Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio','Attrition_Flag']
# Per-class mean and median of every numeric feature.
dataCC[numeric_cols].groupby(['Attrition_Flag']).mean()
dataCC[numeric_cols].groupby(['Attrition_Flag']).median()
The following features show little difference in mean/median between attrited and existing customers:
Customer_Age
Months_on_book
Credit_Limit (has a slight difference, but not much)
Avg_Open_To_Buy
Total_Amt_Chng_Q4_Q1
Total_Ct_Chng_Q4_Q1
#Bivariate Analysis - let us look at how target variable (Attrition_Flag) is distributed amongst categorical variables
# normalize='index' gives, within each class of the row variable, the share of each target value.
pd.crosstab(dataCC['Dependent_count'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Total_Relationship_Count'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Months_Inactive_12_mon'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Contacts_Count_12_mon'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Education_Level'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Income_Category'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Card_Category'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Marital_Status'],dataCC['Attrition_Flag'],normalize='index')
pd.crosstab(dataCC['Gender'],dataCC['Attrition_Flag'],normalize='index')
# Pairwise relationships between related numeric columns, split by target.
#Months_on_book Vs Customer_Age
sns.lineplot(y=dataCC.Months_on_book, x=dataCC.Customer_Age,hue=dataCC.Attrition_Flag);
#Credit_Limit Vs Avg_Open_To_Buy
sns.lineplot(y=dataCC.Credit_Limit, x=dataCC.Avg_Open_To_Buy,hue=dataCC.Attrition_Flag);
#Credit_Limit Vs Avg_Utilization_Ratio
sns.lineplot(y=dataCC.Credit_Limit, x=dataCC.Avg_Utilization_Ratio,hue=dataCC.Attrition_Flag);
#Avg_Open_To_Buy Vs Avg_Utilization_Ratio
sns.lineplot(y=dataCC.Avg_Open_To_Buy, x=dataCC.Avg_Utilization_Ratio,hue=dataCC.Attrition_Flag);
#Total_Revolving_Bal Vs Avg_Utilization_Ratio
sns.lineplot(y=dataCC.Total_Revolving_Bal, x=dataCC.Avg_Utilization_Ratio,hue=dataCC.Attrition_Flag);
#Total_Trans_Amt Vs Total_Trans_Ct
sns.lineplot(y=dataCC.Total_Trans_Amt, x=dataCC.Total_Trans_Ct,hue=dataCC.Attrition_Flag);
#Bivariate plot for numeric variables and target variable
cols = dataCC[
    [
        "Customer_Age",
        "Months_on_book",
        "Credit_Limit",
        "Total_Revolving_Bal",
        "Avg_Open_To_Buy",
        "Total_Amt_Chng_Q4_Q1",
        "Total_Trans_Amt",
        "Total_Trans_Ct",
        "Total_Ct_Chng_Q4_Q1",
        "Avg_Utilization_Ratio"
    ]
].columns.tolist()
# One boxplot per numeric variable, grouped by the target, in a 5x2 grid.
plt.figure(figsize=(10, 20))
for i, variable in enumerate(cols):
    plt.subplot(5, 2, i + 1)
    sns.boxplot(dataCC["Attrition_Flag"], dataCC[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()
#Function to get count plot with hue, that shows % among each class
#Shows crosstab again for better readability and to show % within class and different classes altogether
def perc_on_bar_with_hue(x):
    """Print the crosstab of column `x` vs Attrition_Flag (with totals), then
    draw a grouped bar plot showing, within each class of `x`, the percentage
    split of the target, annotating each bar with its percentage.

    x : str -- name of a categorical column in the global dataCC
    """
    # Counts of each (x, Attrition_Flag) pair, with row/column totals.
    tab1 = pd.crosstab(dataCC[x],dataCC['Attrition_Flag'],margins=True)
    print(tab1)
    print('-'*80)
    # Percentage split of the target within each class of x.
    df1 = dataCC.groupby(x)['Attrition_Flag'].value_counts(normalize=True)
    df1 = df1.mul(100)
    df1 = df1.rename('percent').reset_index()
    g = sns.catplot(x=x,y='percent',hue='Attrition_Flag',kind='bar',data=df1,aspect=2.0)
    g.ax.set_ylim(0,100)
    # Annotate each bar with its percentage value.
    for p in g.ax.patches:
        #print(p)
        height = p.get_height()
        # catplot can yield NaN-height patches for missing class/target combinations.
        if np.isnan(height):
            height = 0
        txt = str(round(height,2)) + '%'
        txt_x = p.get_x()
        txt_y = height
        g.ax.text(txt_x,txt_y,txt)
# Target split within each class of every categorical feature.
#Dependent_count Vs Attrition_Flag
perc_on_bar_with_hue('Dependent_count')
#Total_Relationship_Count Vs Attrition_Flag
perc_on_bar_with_hue('Total_Relationship_Count')
#Months_Inactive_12_mon Vs Attrition_Flag
perc_on_bar_with_hue('Months_Inactive_12_mon')
#Contacts_Count_12_mon Vs Attrition_Flag
perc_on_bar_with_hue('Contacts_Count_12_mon')
#Education_Level Vs Attrition_Flag
perc_on_bar_with_hue('Education_Level')
#Income_Category Vs Attrition_Flag
perc_on_bar_with_hue('Income_Category')
#Card_Category Vs Attrition_Flag
perc_on_bar_with_hue('Card_Category')
#Marital_Status Vs Attrition_Flag
perc_on_bar_with_hue('Marital_Status')
#Gender Vs Attrition_Flag
perc_on_bar_with_hue('Gender')
The percentages of attrited vs. existing customers appear almost the same across males and females (there is a very slight increase for females).
#Change target variable (Attrition_Flag) to 1 and 0
#Attrition_Flag: Internal event (customer activity) variable - if the account is closed then 1 else 0
attrition = {'Existing Customer':0, 'Attrited Customer':1}
dataCC['Attrition_Flag']=dataCC['Attrition_Flag'].map(attrition)
#Checking columns that contain zeros
for col in dataCC.drop('Attrition_Flag',axis=1).columns:
    if (dataCC[col] == 0).any() == True:
        print('{} has one or more zeros'.format(col))
#For KNN to work, the columns for which we do KNN imputation need to be converted to numeric
#After KNN imputation for missing values is done, reverse mapping can be done to restore original values
# Ordinal encoding: the dicts below are also used later by inverse_mapping.
education = {'Uneducated':1, 'High School':2, 'College':3, 'Graduate':4, 'Post-Graduate':5, 'Doctorate':6}
dataCC['Education_Level']=dataCC['Education_Level'].map(education)
income = {'Less than $40K':1, '$40K - $60K':2, '$60K - $80K':3, '$80K - $120K':4, '$120K +':5}
dataCC['Income_Category']=dataCC['Income_Category'].map(income)
marital_status = {'Married':1,'Single':2, 'Divorced':3}
dataCC['Marital_Status']=dataCC['Marital_Status'].map(marital_status)
#Checking nulls
# Categories not present in the mapping dicts (e.g. 'Unknown') become NaN here.
dataCC.isnull().sum().sort_values(ascending=False)
# Separating target variable and other variables
X = dataCC.drop(columns='Attrition_Flag',axis=1)
Y = dataCC['Attrition_Flag'].astype('category')
#Dropping Avg_Open_To_Buy, since Credit limit shows 100% correlation with Avg_open_to_buy
X.drop('Avg_Open_To_Buy',axis=1,inplace=True)
#Dropping Total_Trans_Ct, since Total_Trans_Ct shows 81% correlation with Total_Trans_Amt
X.drop('Total_Trans_Ct',axis=1,inplace=True)
# Splitting the data into train and test sets
# stratify=Y keeps the class ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
#checking target variable imbalance in train and test set
print('Existing customer % in training dataset is: {}'.format((y_train == 0).sum()/y_train.count()) )
print('Attrited customer % in training dataset is: {}'.format((y_train == 1).sum()/y_train.count()) )
print('Existing customer % in test dataset is: {}'.format((y_test == 0).sum()/y_test.count()) )
print('Attrited customer % in test dataset is: {}'.format((y_test == 1).sum()/y_test.count()) )
# KNN imputation of the encoded categorical columns; fit on train only to
# avoid leaking test-set information.
imputer = KNNImputer(n_neighbors=5)
reqd_col_for_impute = ['Education_Level','Income_Category','Marital_Status']
#Fit and transform the train data
X_train[reqd_col_for_impute]=imputer.fit_transform(X_train[reqd_col_for_impute])
#Transform the test data
X_test[reqd_col_for_impute]=imputer.transform(X_test[reqd_col_for_impute])
#Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print('-'*30)
print(X_test.isna().sum())
## Function to inverse the encoding
def inverse_mapping(x, y):
    """Map KNN-imputed numeric codes in column `y` back to their original
    category labels using the inverse of encoding dict `x`, updating the
    global X_train and X_test frames in place.

    x : dict mapping original label -> numeric code
    y : str, name of the column to decode
    """
    decoder = {code: label for label, code in x.items()}
    # Imputed values are fractional, so round to the nearest code first.
    for frame in (X_train, X_test):
        frame[y] = np.round(frame[y]).map(decoder).astype('category')
# Restore the original category labels after imputation.
inverse_mapping(education,'Education_Level')
inverse_mapping(income,'Income_Category')
inverse_mapping(marital_status,'Marital_Status')
#Checking inverse mapped values/categories on training dataset
for i in reqd_col_for_impute:
    print(X_train[i].value_counts())
    print('*'*30)
#Checking inverse mapped values/categories on test dataset
for i in reqd_col_for_impute:
    print(X_test[i].value_counts())
    print('*'*30)
# One-hot encode the categorical columns; drop_first avoids redundant dummies.
X_train=pd.get_dummies(X_train,drop_first=True)
X_test=pd.get_dummies(X_test,drop_first=True)
print(X_train.shape, X_test.shape)
X_train.info()
True Positives: attrited customers correctly predicted as attrited.
True Negatives: existing customers correctly predicted as existing.
False Positives: existing customers incorrectly predicted as attrited.
False Negatives: attrited customers incorrectly predicted as existing — the costliest error for the bank, which is why recall is the primary metric.
## Function to calculate different metric scores of the model - Accuracy, Recall and Precision
def get_metrics_score(model, train, test, train_y, test_y, flag=True):
    """Compute accuracy, recall and precision on the train and test sets.

    model           : fitted classifier exposing .predict and .score
    train, test     : feature matrices for the train and test sets
    train_y, test_y : corresponding true labels
    flag            : when True (default), also print each metric

    Returns
    -------
    list : [train_acc, test_acc, train_recall, test_recall,
            train_precision, test_precision]
    """
    pred_train = model.predict(train)
    pred_test = model.predict(test)
    train_acc = model.score(train, train_y)
    test_acc = model.score(test, test_y)
    train_recall = metrics.recall_score(train_y, pred_train)
    test_recall = metrics.recall_score(test_y, pred_test)
    train_precision = metrics.precision_score(train_y, pred_train)
    test_precision = metrics.precision_score(test_y, pred_test)
    score_list = [train_acc, test_acc, train_recall, test_recall,
                  train_precision, test_precision]
    # If the flag is set to True then only the following print statements will be displayed. The default value is set to True.
    # FIX: reuse the values computed above instead of re-running predict/score
    # for every print statement (the original computed every metric twice).
    if flag:
        print("Accuracy on training set : ", train_acc)
        print("Accuracy on test set : ", test_acc)
        print("Recall on training set : ", train_recall)
        print("Recall on test set : ", test_recall)
        print("Precision on training set : ", train_precision)
        print("Precision on test set : ", test_precision)
    return score_list  # returning the list with train and test scores
#Function to display confusion matrix
def make_confusion_matrix(model, y_actual, labels=(0, 1)):
    """Plot the confusion matrix of `model` predictions on the global X_test.

    model    : fitted classifier used to predict on X_test
    y_actual : ground-truth labels for X_test
    labels   : class order for matrix rows/columns (default (0, 1), i.e.
               rows are Actual - No / Actual - Yes)

    BUG FIX: the original accepted `labels` (defaulting to the misleading
    [1, 0]) but ignored it — it hard-coded labels=[0, 1] in confusion_matrix
    and then shadowed the parameter with the annotation strings. The parameter
    is now honoured, and its default matches the order the original actually
    used, so every existing call behaves identically.
    """
    y_predict = model.predict(X_test)  # NOTE: relies on the module-level X_test
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=list(labels))
    df_cm = pd.DataFrame(cm, index=["Actual - No", "Actual - Yes"],
                         columns=['Predicted - No', 'Predicted - Yes'])
    # Each cell shows the raw count and its share of all predictions.
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten() / np.sum(cm)]
    annot = [f"{v1}\n{v2}" for v1, v2 in zip(group_counts, group_percentages)]
    annot = np.asarray(annot).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=annot, fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Baseline logistic regression on the (imbalanced) training data.
lr = LogisticRegression(random_state=1)
lr.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_bfr=cross_val_score(estimator=lr, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_bfr)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(lr,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(lr,y_test)
print("Before UpSampling, counts of label 'Existing': {}".format(sum(y_train==0)))
print("Before UpSampling, counts of label 'Attrited': {} \n".format(sum(y_train==1)))
# sampling_strategy=1 oversamples the minority class to a 1:1 ratio.
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1) #Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Existing': {}".format(sum(y_train_over==0)))
print("After UpSampling, counts of label 'Attrited': {} \n".format(sum(y_train_over==1)))
print('After UpSampling, the shape of train_X: {}'.format(X_train_over.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_train_over.shape))
log_reg_over = LogisticRegression(random_state = 1)
# Training the basic logistic regression model with training set
log_reg_over.fit(X_train_over,y_train_over)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_over=cross_val_score(estimator=log_reg_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold)
print(cv_result_over)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(log_reg_over,X_train_over,X_test,y_train_over,y_test)
# creating confusion matrix
make_confusion_matrix(log_reg_over,y_test)
Performance on the training set improved, but the model shows a clear gap between train and test scores.
The false-negative rate came down to 5.6% compared with the previous model, which is good.
True positives (correct positive predictions) increased to 10% compared with the previous model, which is also good.
The model is overfitting: train and test scores differ noticeably.
We can try both of these options:
a) Regularization, to see if overfitting can be reduced.
b) Undersampling the training set to handle the class imbalance, and then checking model performance.
# Choose the type of classifier.
#LogisticRegression has a parameter 'penalty', default values is l2. SAGA solver supports l1 penalty.
#l1 is Lasso Regression and l2 is Ridge Regression
#Since we use SAGA solver, we are implementing Lasso
# NOTE(review): penalty is left at its default ('l2'); saga alone does not
# switch to l1 — confirm whether penalty='l1' was intended.
lr_estimator = LogisticRegression(random_state=1,solver='saga')
# Grid of parameters to choose from
# C is the inverse regularization strength: smaller C = stronger regularization.
parameters = {'C': np.arange(0.1,1.1,0.1)}
# Run the grid search
grid_obj = GridSearchCV(lr_estimator, parameters, scoring='recall')
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
lr_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
lr_estimator.fit(X_train_over, y_train_over)
#Calculating different metrics
get_metrics_score(lr_estimator,X_train_over,X_test,y_train_over,y_test)
# creating confusion matrix
make_confusion_matrix(lr_estimator,y_test)
Over-fitting has reduced to a good extent, test recall is lower compared to previous model, but train and test recall are close now
False negative has come down to 7%, compared to base model
True positives which are correct positive predictions increased to 9%, compared to base model
We can try downsampling now
# Random undersampling: shrink the majority class to match the minority class.
rus = RandomUnderSampler(random_state = 1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Existing': {}".format(sum(y_train==0)))
print("Before Under Sampling, counts of label 'Attrited': {} \n".format(sum(y_train==1)))
print("After Under Sampling, counts of label 'Existing': {}".format(sum(y_train_un==0)))
print("After Under Sampling, counts of label 'Attrited': {} \n".format(sum(y_train_un==1)))
print('After Under Sampling, the shape of train_X: {}'.format(X_train_un.shape))
print('After Under Sampling, the shape of train_y: {} \n'.format(y_train_un.shape))
log_reg_under = LogisticRegression(random_state = 1)
log_reg_under.fit(X_train_un,y_train_un )
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_under=cross_val_score(estimator=log_reg_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold)
print(cv_result_under)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(log_reg_under,X_train_un,X_test,y_train_un,y_test)
# creating confusion matrix
make_confusion_matrix(log_reg_under,y_test)
# Coefficients (log-odds) sorted by absolute magnitude.
log_odds = log_reg_under.coef_[0]
df = pd.DataFrame(log_odds, X_train_un.columns, columns=['coef'])
df.reindex(df.coef.abs().sort_values(ascending=False).index)
# exp(coef) - 1 = fractional change in the odds per one-unit feature increase.
odds = np.exp(log_reg_under.coef_[0])-1
pd.set_option('display.max_rows',None)
df=pd.DataFrame(odds, X_train_un.columns, columns=['odds'])
df.reindex(df.odds.abs().sort_values(ascending=False).index)
Contacts_Count_12_mon: For one-unit increase in the number of contacts in last 12 months, we expect to see about a 52% increase in the odds of a customer leaving credit card services
Months_Inactive_12_mon: For one-unit increase in the number of inactive months in last 12 months, we expect to see about a 35% increase in the odds of a customer leaving credit card services
Total_Relationship_Count: For one-unit increase in the number of products owned by a customer, we expect to see about a 29% decrease in the odds of a customer leaving credit card services
Dependent_count: For one-unit increase in the number of dependents, we expect to see about a 12% increase in the odds of a customer leaving credit card services
Total_Ct_Chng_Q4_Q1: For one-unit increase in the number of transactions between Q1 and Q4, we expect to see about a 11% decrease in the odds of a customer leaving credit card services
All other features can be interpreted in the same way.
We will try other models and see the feature importance
Stratified K-Folds cross-validator is used, which will split data in train/validation sets. Split dataset into k consecutive folds keeping distribution of both classes in each fold same as the target variable. Each fold is then used once as a validation while the k - 1 remaining folds form the training set.
This is better than splitting the data into just 3 sets - train,validation and test
Since I want to apply StandardScaler() on the dataset and fit the models, I am using pipeline() to do both together
results = [] # Empty list to store all model's CV scores
names = [] # Empty list to store name of the models
# Decision tree inside a scaling pipeline; class_weight {0:0.16, 1:0.84}
# up-weights the minority (attrited) class, mirroring the class imbalance.
dtree = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("d_tree", DecisionTreeClassifier(criterion='gini',class_weight={0:0.16,1:0.84},random_state=1))
    ]
)
dtree.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_dtree=cross_val_score(estimator=dtree, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_dtree)
results.append(cv_result_dtree)
names.append('DTREE')
print("{}: {}".format('Mean recall score for DecisionTreeClassifier is', cv_result_dtree.mean() * 100))
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(dtree,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(dtree,y_test)
# Horizontal bar chart of the tree's feature importances, least to most important.
feature_names = X_train.columns
importances = dtree[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Random forest with default hyperparameters inside a scaling pipeline.
rf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("random_forest", RandomForestClassifier(random_state=1))
    ]
)
rf.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_rf=cross_val_score(estimator=rf, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_rf)
results.append(cv_result_rf)
names.append('RF')
print("{}: {}".format('Mean recall score for RandomForestClassifier is', cv_result_rf.mean() * 100))
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(rf,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(rf,y_test)
# Feature importances of the fitted forest (second pipeline step).
feature_names = X_train.columns
importances = rf[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Bagging classifier (default base estimator) inside a scaling pipeline.
# No feature-importance plot here: BaggingClassifier does not expose one.
bc = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("bagging", BaggingClassifier(random_state=1))
    ]
)
bc.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_bc=cross_val_score(estimator=bc, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_bc)
results.append(cv_result_bc)
names.append('BC')
print("{}: {}".format('Mean recall score for BaggingClassifier is', cv_result_bc.mean() * 100))
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(bc,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(bc,y_test)
# XGBoost inside a scaling pipeline; eval_metric='logloss' silences the
# default-metric warning in recent xgboost versions.
xgb = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("xgboost", XGBClassifier(random_state=1,eval_metric='logloss'))
    ]
)
xgb.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_xgb=cross_val_score(estimator=xgb, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_xgb)
results.append(cv_result_xgb)
names.append('XGB')
print("{}: {}".format('Mean recall score for XGBClassifier is', cv_result_xgb.mean() * 100))
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(xgb,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(xgb,y_test)
# Feature importances of the fitted booster (second pipeline step).
feature_names = X_train.columns
importances = xgb[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# AdaBoost with default hyperparameters inside a scaling pipeline.
abc = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("adaboost", AdaBoostClassifier(random_state=1))
    ]
)
abc.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_abc=cross_val_score(estimator=abc, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_abc)
results.append(cv_result_abc)
names.append('ABC')
print("{}: {}".format('Mean recall score for AdaboostClassifier is', cv_result_abc.mean() * 100))
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(abc,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(abc,y_test)
# Feature importances of the fitted AdaBoost model (second pipeline step).
feature_names = X_train.columns
importances = abc[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Gradient boosting with default hyperparameters inside a scaling pipeline.
gbm = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("gradient_boosting", GradientBoostingClassifier(random_state=1))
    ]
)
gbm.fit(X_train,y_train)
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1) #Setting number of splits equal to 5
cv_result_gbm=cross_val_score(estimator=gbm, X=X_train, y=y_train, scoring=scoring, cv=kfold)
print(cv_result_gbm)
results.append(cv_result_gbm)
names.append('GBM')
print("{}: {}".format('Mean recall score for GradientboostClassifier is', cv_result_gbm.mean() * 100))
#Checking the performance on test dataset
#Calculating different metrics
get_metrics_score(gbm,X_train,X_test,y_train,y_test)
# creating confusion matrix
make_confusion_matrix(gbm,y_test)
# Feature importances of the fitted GBM (second pipeline step).
feature_names = X_train.columns
importances = gbm[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
# One row per model, one column per CV fold score.
dfResults = pd.DataFrame(results,index=names,columns=['CV_Score1','CV_Score2','CV_Score3','CV_Score4','CV_Score5'])
#sorting the CV results for each model (row wise) and showing the max CV score at the end (last column) and avg score
a = dfResults.values
a.sort(axis=1)
dfResults1 = pd.DataFrame(a, dfResults.index, dfResults.columns)
dfResults1['AVG_Score'] = dfResults1.mean(axis=1)
print(dfResults1)
#print(results)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Next, we build pipelines combining StandardScaler with each of the models above and tune them using GridSearchCV and RandomizedSearchCV.
Since the Pipeline() constructor was already used to create pipelines in the code above, the make_pipeline() helper function is used here instead.
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier(criterion='gini',class_weight={0:0.16,1:0.84},random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
'decisiontreeclassifier__max_depth': np.arange(1,10),
'decisiontreeclassifier__min_samples_leaf': np.arange(1,10,2),
'decisiontreeclassifier__max_leaf_nodes' : np.arange(1,10,2),
'decisiontreeclassifier__min_impurity_decrease': [0.0001,0.001,0.01],
'decisiontreeclassifier__max_features': np.arange(0.2, 0.7, 0.2)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with best parameters
dtree_tuned1 = make_pipeline(
StandardScaler(),
DecisionTreeClassifier(
criterion='gini',class_weight={0:0.16,1:0.84},random_state=1,
max_depth=4,
max_leaf_nodes=9,
min_samples_leaf=1,
min_impurity_decrease=0.0001,
max_features=0.4
)
)
# Fit the model on training data
dtree_tuned1.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(dtree_tuned1,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(dtree_tuned1, y_test)
feature_names = X_train.columns
importances = dtree_tuned1[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=1,class_weight={0:0.16,1:0.84}))
# Parameter grid to pass in GridSearchCV
param_grid = {
"randomforestclassifier__max_depth": np.arange(1,10),
"randomforestclassifier__n_estimators": [150,200,250],
"randomforestclassifier__min_samples_leaf": np.arange(5, 10),
"randomforestclassifier__max_features": np.arange(0.2, 0.7, 0.1)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with best parameters
rf_tuned1 = make_pipeline(
StandardScaler(),
RandomForestClassifier(
class_weight={0:0.16,1:0.84},random_state=1,
max_features=0.6000000000000001,
min_samples_leaf=6,
n_estimators=150,
max_depth=8
)
)
# Fit the model on training data
rf_tuned1.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(rf_tuned1,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(rf_tuned1, y_test)
feature_names = X_train.columns
importances = rf_tuned1[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), BaggingClassifier(random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
'baggingclassifier__max_samples': np.arange(0.7,1.1,0.1),
'baggingclassifier__max_features': np.arange(0.7,1.1,0.1),
'baggingclassifier__n_estimators' : np.arange(10,60,10)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with best parameters
bc_tuned1 = make_pipeline(
StandardScaler(),
BaggingClassifier(
random_state=1,
max_features=0.9999999999999999,
max_samples=0.8999999999999999,
n_estimators=50
)
)
# Fit the model on training data
bc_tuned1.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(bc_tuned1,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(bc_tuned1, y_test)
%%time
# Hyperparameter tuning of XGBoost with GridSearchCV.
# Creating pipeline; eval_metric='logloss' is passed explicitly (avoids the
# default-eval-metric warning in recent xgboost versions).
pipe = make_pipeline(StandardScaler(), XGBClassifier(random_state=1,eval_metric='logloss'))
# Parameter grid to pass in GridSearchCV
param_grid = {
    'xgbclassifier__n_estimators': np.arange(75,175,25),
    'xgbclassifier__subsample':np.arange(0.7,1.1,0.1),
    'xgbclassifier__learning_rate':[0.01,0.1,0.2,0.05],
    #'xgbclassifier__gamma':np.arange(0,6,1),
    'xgbclassifier__colsample_bytree':np.arange(0.5,1.0,0.2),
    'xgbclassifier__colsample_bylevel':np.arange(0.5,1.0,0.2),
    # scale_pos_weight up-weights the positive (churner) class to counter imbalance.
    'xgbclassifier__scale_pos_weight':np.arange(5,7,1)
}
# Type of scoring used to compare parameter combinations: recall, since the
# business goal is to catch as many churners as possible.
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with best parameters hard-coded from the search output
# (values like 0.8999999999999999 are the exact floats produced by np.arange).
xgb_tuned1 = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        random_state=1,
        eval_metric='logloss',
        n_estimators=100,
        subsample=0.7,
        learning_rate=0.05,
        #gamma=5,
        colsample_bytree=0.7,
        colsample_bylevel=0.8999999999999999,
        scale_pos_weight=6
    )
)
# Fit the model on training data
xgb_tuned1.fit(X_train, y_train)
# Calculating different metrics (train/test accuracy, recall, precision)
get_metrics_score(xgb_tuned1,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(xgb_tuned1, y_test)
# Feature importances of the tuned XGBoost model (pipeline step [1]),
# plotted least-to-most important.
feature_names = X_train.columns
importances = xgb_tuned1[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Hyperparameter tuning of AdaBoost with GridSearchCV.
# Creating pipeline
pipe = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
# Parameter grid to pass in GridSearchCV.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4 — confirm the pinned scikit-learn version still accepts it.
param_grid = {
    "adaboostclassifier__n_estimators": np.arange(10, 110, 10),
    "adaboostclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    # Weak learners of increasing depth (stumps up to depth-3 trees).
    "adaboostclassifier__base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ]
}
# Type of scoring used to compare parameter combinations (recall on churners)
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with the best parameters reported by the search above
abc_tuned1 = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier(
        random_state=1,
        n_estimators=70,
        learning_rate=1,
        base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1)
    )
)
# Fit the model on training data
abc_tuned1.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(abc_tuned1,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(abc_tuned1, y_test)
# Feature importances of the tuned AdaBoost (pipeline step [1]).
feature_names = X_train.columns
importances = abc_tuned1[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Hyperparameter tuning of Gradient Boosting with GridSearchCV.
# Creating pipeline. `init=AdaBoostClassifier(...)` makes AdaBoost provide the
# initial predictions that boosting then corrects — an unusual choice;
# presumably intentional, but verify it actually helps over the default init.
pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
    'gradientboostingclassifier__n_estimators': np.arange(50,200,25),
    'gradientboostingclassifier__subsample':np.arange(0.7,1.1,0.1),
    'gradientboostingclassifier__max_features':np.arange(0.7,1.1,0.1)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with best parameters (exact np.arange floats kept as-is)
gbm_tuned1 = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(
        random_state=1,
        n_estimators=175,
        subsample=0.7999999999999999,
        max_features=0.8999999999999999,
        init=AdaBoostClassifier(random_state=1)
    )
)
# Fit the model on training data
gbm_tuned1.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(gbm_tuned1,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(gbm_tuned1, y_test)
# Feature importances of the tuned GBM (pipeline step [1]).
feature_names = X_train.columns
importances = gbm_tuned1[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
#Creating pipeline
pipe=make_pipeline(StandardScaler(),DecisionTreeClassifier(criterion='gini',class_weight={0:0.16,1:0.84},random_state=1))
#Parameter distribution to pass in RandomSearchCV
param_dist={
'decisiontreeclassifier__max_depth': np.arange(1,10),
'decisiontreeclassifier__min_samples_leaf': np.arange(1,10,2),
'decisiontreeclassifier__max_leaf_nodes' : np.arange(1,10,2),
'decisiontreeclassifier__min_impurity_decrease': [0.0001,0.001,0.01],
'decisiontreeclassifier__max_features': np.arange(0.2, 0.7, 0.2)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
# Creating new pipeline with best parameters
dtree_tuned2 = make_pipeline(
StandardScaler(),
DecisionTreeClassifier(
criterion='gini',class_weight={0:0.16,1:0.84},random_state=1,
max_depth=9,
min_samples_leaf=3,
min_impurity_decrease=0.0001,
max_leaf_nodes=9,
max_features=0.6000000000000001
)
)
# Fit the model on training data
dtree_tuned2.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(dtree_tuned2,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(dtree_tuned2, y_test)
feature_names = X_train.columns
importances = dtree_tuned2[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
#Creating pipeline
pipe=make_pipeline(StandardScaler(),RandomForestClassifier(random_state=1,class_weight={0:0.16,1:0.84}))
#Parameter distribution to pass in RandomSearchCV
param_dist= {
"randomforestclassifier__max_depth": np.arange(1,10),
"randomforestclassifier__n_estimators": [150,200,250],
"randomforestclassifier__min_samples_leaf": np.arange(5, 10),
"randomforestclassifier__max_features": np.arange(0.2, 0.7, 0.1)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
# Creating new pipeline with best parameters
rf_tuned2 = make_pipeline(
StandardScaler(),
RandomForestClassifier(
class_weight={0:0.16,1:0.84},
random_state=1,
max_features=0.6000000000000001,
min_samples_leaf=9,
n_estimators=150,
max_depth=9
)
)
# Fit the model on training data
rf_tuned2.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(rf_tuned2,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(rf_tuned2, y_test)
feature_names = X_train.columns
importances = rf_tuned2[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
#Creating pipeline
pipe=make_pipeline(StandardScaler(),BaggingClassifier(random_state=1))
#Parameter distribution to pass in RandomSearchCV
param_dist= {
'baggingclassifier__max_samples': np.arange(0.7,1.1,0.1),
'baggingclassifier__max_features': np.arange(0.7,1.1,0.1),
'baggingclassifier__n_estimators' : np.arange(10,60,10)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
# Creating new pipeline with best parameters
bc_tuned2 = make_pipeline(
StandardScaler(),
BaggingClassifier(
random_state=1,
max_features=0.9999999999999999,
max_samples=0.7999999999999999,
n_estimators=40
)
)
# Fit the model on training data
bc_tuned2.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(bc_tuned2,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(bc_tuned2, y_test)
%%time
# Hyperparameter tuning of XGBoost with RandomizedSearchCV (50 sampled candidates).
#Creating pipeline; eval_metric='logloss' set explicitly to avoid the default-metric warning
pipe=make_pipeline(StandardScaler(),XGBClassifier(random_state=1,eval_metric='logloss'))
#Parameter distribution to pass in RandomizedSearchCV
param_dist= {
    'xgbclassifier__n_estimators': np.arange(75,175,25),
    'xgbclassifier__subsample':np.arange(0.7,1.1,0.1),
    'xgbclassifier__learning_rate':[0.01,0.1,0.2,0.05],
    #'xgbclassifier__gamma':np.arange(0,6,1),
    'xgbclassifier__colsample_bytree':np.arange(0.5,1.0,0.2),
    'xgbclassifier__colsample_bylevel':np.arange(0.5,1.0,0.2),
    # scale_pos_weight up-weights the positive (churner) class to counter imbalance.
    'xgbclassifier__scale_pos_weight':np.arange(5,7,1)
}
# Type of scoring used to compare parameter combinations (recall on churners)
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
# Creating new pipeline with the best sampled parameters (exact np.arange floats kept)
xgb_tuned2 = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        random_state=1,
        eval_metric='logloss',
        n_estimators=75,
        subsample=0.7,
        learning_rate=0.1,
        #gamma=5,
        colsample_bytree=0.5,
        colsample_bylevel=0.8999999999999999,
        scale_pos_weight=6
    )
)
# Fit the model on training data
xgb_tuned2.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(xgb_tuned2,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(xgb_tuned2, y_test)
# Feature importances of the tuned model (pipeline step [1]).
feature_names = X_train.columns
importances = xgb_tuned2[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Hyperparameter tuning of AdaBoost with RandomizedSearchCV (50 sampled candidates).
#Creating pipeline
pipe=make_pipeline(StandardScaler(),AdaBoostClassifier(random_state=1))
# Parameter distribution to pass in RandomizedSearchCV.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4 — confirm the pinned scikit-learn version still accepts it.
param_dist= {
    "adaboostclassifier__n_estimators": np.arange(10, 110, 10),
    "adaboostclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    # Weak learners of increasing depth (stumps up to depth-3 trees).
    "adaboostclassifier__base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ]
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
# Creating new pipeline with the best sampled parameters
abc_tuned2 = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier(
        random_state=1,
        n_estimators=90,
        learning_rate=0.2,
        base_estimator=DecisionTreeClassifier(max_depth=3, random_state=1)
    )
)
# Fit the model on training data
abc_tuned2.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(abc_tuned2,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(abc_tuned2, y_test)
# Feature importances of the tuned AdaBoost (pipeline step [1]).
feature_names = X_train.columns
importances = abc_tuned2[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
%%time
# Hyperparameter tuning of Gradient Boosting with RandomizedSearchCV.
# Creating pipeline. `init=AdaBoostClassifier(...)` supplies the initial
# predictions that boosting corrects — an unusual choice; presumably
# intentional, but verify it helps over the default init.
pipe=make_pipeline(StandardScaler(),GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),random_state=1))
#Parameter distribution to pass in RandomizedSearchCV
param_dist= {
    'gradientboostingclassifier__n_estimators': np.arange(50,200,25),
    'gradientboostingclassifier__subsample':np.arange(0.7,1.1,0.1),
    'gradientboostingclassifier__max_features':np.arange(0.7,1.1,0.1)
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
# Creating new pipeline with the best sampled parameters (exact np.arange floats kept)
gbm_tuned2 = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(
        random_state=1,
        n_estimators=175,
        subsample=0.7999999999999999,
        max_features=0.7999999999999999,
        init=AdaBoostClassifier(random_state=1)
    )
)
# Fit the model on training data
gbm_tuned2.fit(X_train, y_train)
# Calculating different metrics
get_metrics_score(gbm_tuned2,X_train,X_test,y_train,y_test)
# Creating confusion matrix
make_confusion_matrix(gbm_tuned2, y_test)
# Feature importances of the tuned GBM (pipeline step [1]).
feature_names = X_train.columns
importances = gbm_tuned2[1].feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# defining list of model - logistic regression
models = [lr]
# defining empty lists to collect train and test results; the append order in
# the loops below must stay in sync with the 'Model' labels used later when
# building comparison_frame.
acc_train = []
acc_test = []
recall_train = []
recall_test = []
precision_train = []
precision_test = []
# looping through all the models to get the metrics score - Accuracy, Recall and Precision
# (the final False argument presumably makes get_metrics_score return the six
# scores instead of printing them — confirm against its definition earlier in the file)
for model in models:
    j = get_metrics_score(model,X_train,X_test,y_train,y_test,False)
    acc_train.append(j[0])
    acc_test.append(j[1])
    recall_train.append(j[2])
    recall_test.append(j[3])
    precision_train.append(j[4])
    precision_test.append(j[5])
# defining list of models - logistic regression over-sampling,regularization;
# train metrics are computed on the oversampled training set these models were fit on
models = [log_reg_over, lr_estimator]
# looping through all the models to get the metrics score - Accuracy, Recall and Precision
for model in models:
    j = get_metrics_score(model,X_train_over,X_test,y_train_over,y_test,False)
    acc_train.append(j[0])
    acc_test.append(j[1])
    recall_train.append(j[2])
    recall_test.append(j[3])
    precision_train.append(j[4])
    precision_test.append(j[5])
# defining list of model - logistic regression undersampling;
# train metrics are computed on the undersampled training set
models = [log_reg_under]
# looping through all the models to get the metrics score - Accuracy, Recall and Precision
for model in models:
    j = get_metrics_score(model,X_train_un,X_test,y_train_un,y_test,False)
    acc_train.append(j[0])
    acc_test.append(j[1])
    recall_train.append(j[2])
    recall_test.append(j[3])
    precision_train.append(j[4])
    precision_test.append(j[5])
# defining list of model - Bagging/Boosting without hyper parameter tuning
models = [dtree,rf,bc,xgb,abc,gbm]
# looping through all the models to get the metrics score - Accuracy, Recall and Precision
# NOTE(review): train metrics here are computed on the undersampled set
# (X_train_un / y_train_un), but at least `gbm` was fit on the full X_train
# earlier in this file — verify each of these models was actually trained on
# the undersampled data, otherwise the reported "train" scores are misleading.
for model in models:
    j = get_metrics_score(model,X_train_un,X_test,y_train_un,y_test,False)
    acc_train.append(j[0])
    acc_test.append(j[1])
    recall_train.append(j[2])
    recall_test.append(j[3])
    precision_train.append(j[4])
    precision_test.append(j[5])
# defining list of model - Bagging/Boosting with hyper parameter tuning using GridSearch
models = [dtree_tuned1,rf_tuned1,bc_tuned1,xgb_tuned1,abc_tuned1,gbm_tuned1]
# looping through all the models to get the metrics score - Accuracy, Recall and Precision.
# All six tuned pipelines were fit on X_train/y_train above, so train metrics
# must be computed on that same data; the original passed the undersampled
# X_train_un/y_train_un here, which mislabelled the "train" scores as scores
# on data these models were never trained on.
for model in models:
    j = get_metrics_score(model, X_train, X_test, y_train, y_test, False)
    acc_train.append(j[0])
    acc_test.append(j[1])
    recall_train.append(j[2])
    recall_test.append(j[3])
    precision_train.append(j[4])
    precision_test.append(j[5])
# defining list of model - Bagging/Boosting with hyper parameter tuning using RandomSearch
models = [dtree_tuned2,rf_tuned2,bc_tuned2,xgb_tuned2,abc_tuned2,gbm_tuned2]
# looping through all the models to get the metrics score - Accuracy, Recall and Precision.
# All six tuned pipelines were fit on X_train/y_train above, so train metrics
# must be computed on that same data; the original passed the undersampled
# X_train_un/y_train_un here, which mislabelled the "train" scores.
for model in models:
    j = get_metrics_score(model, X_train, X_test, y_train, y_test, False)
    acc_train.append(j[0])
    acc_test.append(j[1])
    recall_train.append(j[2])
    recall_test.append(j[3])
    precision_train.append(j[4])
    precision_test.append(j[5])
# Assemble one comparison table of every model evaluated above.
# Row order must match the append order of the metric lists built in the
# preceding loops — 22 labels for 22 appended score sets.
comparison_frame = pd.DataFrame(
    {
        'Model':['Logistic Regression',
            'Logistic Regression - Oversampled data',
            'Logistic Regression-Regularized-OversampledData',
            'Logistic Regression - Undersampled data',
            'Decision Tree Classifier',
            'RandomForest Classifier',
            'Bagging Classifier',
            'XGBoost Classifier',
            'Adaboost Classifier',
            'GradientBoost Classifier',
            'Decision Tree - GridSearch',
            'RandomForest - GridSearch',
            'Bagging - GridSearch',
            'XGBoost - GridSearch',
            'Adaboost - GridSearch',
            'GradientBoost - GridSearch',
            'Decision Tree - RandomSearch',
            'RandomForest - RandomSearch',
            'Bagging - RandomSearch',
            'XGBoost - RandomSearch',
            'Adaboost - RandomSearch',
            'GradientBoost - RandomSearch'
        ],
        'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
        'Train_Recall':recall_train,'Test_Recall':recall_test,
        'Train_Precision':precision_train,'Test_Precision':precision_test
    }
)
#Sorting models in decreasing order of test recall. Note: sort_values returns
#a new sorted frame for notebook display; comparison_frame itself is unchanged.
comparison_frame.sort_values(by="Test_Recall", ascending=False)
# Wall-clock tuning times observed for each model family, to compare the cost
# of GridSearchCV versus RandomizedSearchCV.
_model_names = [
    'Decision Tree',
    'Random Forest',
    'Bagging Classifier',
    'XGBoost',
    'Adaboost',
    'GradientBoost',
]
_grid_times = ['2min 49s', '1h 7 min 19s', '4min 15s', '44min 32s', '7min 3s', '10min 41s']
_random_times = ['4.27s', '5min 1s', '1min 31s', '1min 18s', '2min 28s', '3min 33s']
time_df = pd.DataFrame({
    'Model': _model_names,
    'Grid Search': _grid_times,
    'Randomized Search': _random_times,
})
# Display the table (notebook cell output).
time_df